#####################################################################################
# Project: Roll Call Votes in the European Parliament
#####################################################################################
  
  
  # Remove all objects (left disabled)
  # rm(list=ls());
  # Trigger a garbage collection before the data work starts.
  gc()

  # Create the output directory only when it is missing, so re-running the
  # script does not raise an "already exists" warning.
  if (!dir.exists("data/ep")) {
    dir.create("data/ep/", recursive = TRUE)
  }

  # Load libraries
  x <- c("tidyverse", "readxl")

  # library() stops with an error when a package is not installed, whereas
  # require() only returns FALSE and lets the script fail much later.
  # invisible() suppresses the list that lapply() would otherwise print.
  invisible(lapply(x, library, character.only = TRUE))

  # NOTE(review): the seed is disabled, so the random column sample and the
  # random id check further down differ between runs.
  # set.seed(1917)

## Roll Call Data Frame ========
  # Download data
  # Names are the file names on the server; values are the names used for the
  # local copy inside data/ep.
  ep_base_url <- "https://personal.lse.ac.uk/hix/EP%20Data/"
  ep_archives <- c(
    "rcv_ep1.zip"          = "rcv_ep1.zip",
    "rcv_ep2.zip"          = "rcv_ep2.zip",
    "rcv_ep3.zip"          = "rcv_ep3.zip",
    "rcv_ep4.zip"          = "rcv_ep4.zip",
    "ep5_rcv_11Jul06.zip"  = "rcv_ep5_11Jul06.zip",
    "mep_info_26Jul11.zip" = "mep_info_26Jul11.zip"
  )

  # Fetch, extract and delete each archive in turn.
  for (remote_name in names(ep_archives)) {
    zip_path <- file.path("data/ep", ep_archives[[remote_name]])

    # mode = "wb" forces a binary transfer; a text-mode transfer would corrupt
    # the archive on Windows.
    status <- download.file(paste0(ep_base_url, remote_name), zip_path,
                            mode = "wb")
    # download.file() signals some failures only through a non-zero status.
    if (status != 0) {
      stop("Download failed for ", remote_name, " (status ", status, ")",
           call. = FALSE)
    }

    unzip(zip_path, exdir = "data/ep")

    # The archive is no longer needed once its contents are extracted.
    file.remove(zip_path)
  }
  
  # Read in data
  # The extracted .txt files are parsed as comma-separated tables with a
  # header row; read.csv() applies the same parser defaults as
  # read.delim(sep = ",").
  rcv_ep1 <- read.csv("data/ep/rcv_ep1.txt")
  rcv_ep2 <- read.csv("data/ep/rcv_ep2.txt")
  rcv_ep3 <- read.csv("data/ep/rcv_ep3.txt")
  rcv_ep4 <- read.csv("data/ep/rcv_ep4.txt")
  rcv_ep5 <- read.csv("data/ep/rcv_ep5_11Jul06.txt")

  # Clean data set
  # Recode one column of vote codes: 0, 3, 4 and 5 become NA, 2 becomes 0,
  # and every other value is passed through unchanged.
  recode_votes <- function(codes) {
    ifelse(codes %in% c(0, 3, 4, 5), NA, ifelse(codes == 2, 0, codes))
  }

  # Negative index that skips the first five columns, which are not recoded.
  vote_cols <- -c(1:5)

  rcv_ep1[, vote_cols] <- apply(rcv_ep1[, vote_cols], 2, recode_votes)
  rcv_ep2[, vote_cols] <- apply(rcv_ep2[, vote_cols], 2, recode_votes)
  rcv_ep3[, vote_cols] <- apply(rcv_ep3[, vote_cols], 2, recode_votes)
  rcv_ep4[, vote_cols] <- apply(rcv_ep4[, vote_cols], 2, recode_votes)
  rcv_ep5[, vote_cols] <- apply(rcv_ep5[, vote_cols], 2, recode_votes)
  
  # Flag the rows that have more than 10 non-missing entries outside the
  # first five columns; rows at or below that count are dropped.
  has_enough_entries <- function(rcv) {
    rowSums(!is.na(rcv[, -c(1:5)])) > 10
  }

  rcv_ep1 <- rcv_ep1[has_enough_entries(rcv_ep1), ]
  rcv_ep2 <- rcv_ep2[has_enough_entries(rcv_ep2), ]
  rcv_ep3 <- rcv_ep3[has_enough_entries(rcv_ep3), ]
  rcv_ep4 <- rcv_ep4[has_enough_entries(rcv_ep4), ]
  rcv_ep5 <- rcv_ep5[has_enough_entries(rcv_ep5), ]
  
  # Keep the first `n_meta` columns as they are and drop every later column
  # that has `min_obs` or fewer non-missing entries.
  #
  # rcv:     data frame whose first `n_meta` columns are always kept.
  # n_meta:  number of leading columns exempt from the filter.
  # min_obs: a column survives only with more than this many non-NA entries.
  # Returns `rcv` restricted to the surviving columns; it is always a data
  # frame, even when a single filtered column (or none) survives.
  drop_sparse_votes <- function(rcv, n_meta = 5, min_obs = 10) {
    is_vote <- seq_len(ncol(rcv)) > n_meta
    keep <- !is_vote | colSums(!is.na(rcv)) > min_obs
    # drop = FALSE stops R from collapsing a one-column result to a vector.
    rcv[, keep, drop = FALSE]
  }

  rcv_ep1 <- drop_sparse_votes(rcv_ep1)
  rcv_ep2 <- drop_sparse_votes(rcv_ep2)
  rcv_ep3 <- drop_sparse_votes(rcv_ep3)
  rcv_ep4 <- drop_sparse_votes(rcv_ep4)
  rcv_ep5 <- drop_sparse_votes(rcv_ep5)
  
  # Keep the first `n_meta` columns as they are and drop every later column
  # whose mean (ignoring NA) is not strictly between `lower` and `upper`.
  #
  # rcv:    data frame whose columns after the first `n_meta` are numeric.
  # n_meta: number of leading columns exempt from the filter.
  # lower, upper: exclusive bounds on the column mean.
  # Returns `rcv` restricted to the surviving columns, always a data frame.
  drop_lopsided_votes <- function(rcv, n_meta = 5, lower = 0.05, upper = 0.95) {
    is_vote <- seq_len(ncol(rcv)) > n_meta
    # Compute each column mean once instead of once per bound.
    share <- rep(NA_real_, ncol(rcv))
    share[is_vote] <- colMeans(rcv[, is_vote, drop = FALSE], na.rm = TRUE)
    # A column with no data has a NaN mean; treat it as failing the filter
    # rather than letting an NA reach the column index.
    in_range <- !is.na(share) & share > lower & share < upper
    rcv[, !is_vote | in_range, drop = FALSE]
  }

  rcv_ep1 <- drop_lopsided_votes(rcv_ep1)
  rcv_ep2 <- drop_lopsided_votes(rcv_ep2)
  rcv_ep3 <- drop_lopsided_votes(rcv_ep3)
  rcv_ep4 <- drop_lopsided_votes(rcv_ep4)
  rcv_ep5 <- drop_lopsided_votes(rcv_ep5)
  
  # Keep the first `n_meta` columns and a random draw (without replacement)
  # of up to `n_votes` of the remaining columns.
  #
  # rcv:     data frame to thin out.
  # n_votes: number of columns to draw; capped at the number available, with
  #          a warning, instead of failing when fewer remain.
  # n_meta:  number of leading columns that are always kept.
  # Returns the reduced data frame. With at least `n_votes` columns available
  # the draw consumes the random stream exactly like
  # sample(6:ncol(rcv), n_votes), so results under a fixed seed are unchanged.
  sample_votes <- function(rcv, n_votes = 500, n_meta = 5) {
    meta_idx <- seq_len(min(n_meta, ncol(rcv)))
    vote_idx <- setdiff(seq_len(ncol(rcv)), meta_idx)
    n_draw <- min(n_votes, length(vote_idx))
    if (n_draw < n_votes) {
      warning("Only ", length(vote_idx), " vote columns available; ",
              "keeping all of them instead of sampling ", n_votes, ".",
              call. = FALSE)
    }
    # Index through sample.int() so a single remaining column is not
    # misread by sample() as "draw from 1:x".
    picked <- vote_idx[sample.int(length(vote_idx), n_draw)]
    rcv[, c(meta_idx, picked), drop = FALSE]
  }

  rcv_ep1 <- sample_votes(rcv_ep1)
  rcv_ep2 <- sample_votes(rcv_ep2)
  rcv_ep3 <- sample_votes(rcv_ep3)
  rcv_ep4 <- sample_votes(rcv_ep4)
  rcv_ep5 <- sample_votes(rcv_ep5)
  
## MEP Data Frame ========

  # Read in data
  # Worksheets 2 to 6 of the workbook are read one by one and tagged with
  # session numbers 1 to 5.
  mep_workbook <- "data/ep/mep_info_26Jul11.xls"

  mp_ep1 <- mutate(read_excel(mep_workbook, sheet = 2), session = 1)
  mp_ep2 <- mutate(read_excel(mep_workbook, sheet = 3), session = 2)
  mp_ep3 <- mutate(read_excel(mep_workbook, sheet = 4), session = 3)
  mp_ep4 <- mutate(read_excel(mep_workbook, sheet = 5), session = 4)
  mp_ep5 <- mutate(read_excel(mep_workbook, sheet = 6), session = 5)

  # Stack the five per-session tables into one.
  mp_ep_all <- bind_rows(mp_ep1, mp_ep2, mp_ep3, mp_ep4, mp_ep5)
  
  
  # Clean data set
  # Drop the `Order in EP5_rcv` column, give the remaining key columns short
  # names, and convert dim1 through dim2 to numeric after turning the "."
  # placeholder into NA.
  mp_ep_final <- mp_ep_all %>%
    dplyr::select(-`Order in EP5_rcv`) %>%
    rename(
      id = `MEP id`,
      country = `Member State`,
      party = `National Party`,
      party_group = `EP Group`,
      dim1 = `NOM-D1`,
      dim2 = `NOM-D2`
    ) %>%
    mutate_at(vars("dim1":"dim2"), ~as.numeric(ifelse(. == ".", NA, .)))
  
  # Sanity check: print the name(s) recorded under a random selection of ids
  # that appear in more than one session, for visual inspection.
  multi_session_ids <- mp_ep_final %>%
    distinct(session, id) %>%
    group_by(id) %>%
    summarize(count = n()) %>%
    filter(count > 1) %>%
    pull(id)

  # Draw at most 50 ids; capping the size avoids an error when fewer than 50
  # ids occur in several sessions.
  n_checks <- min(50, length(multi_session_ids))
  tests <- multi_session_ids[sample.int(length(multi_session_ids), n_checks)]

  # The loop variable is not called `t`, so base::t() is not masked
  # afterwards; %in% never yields NA, so missing ids cannot inject NA rows.
  for (mep_id in tests) {
    print(unique(mp_ep_final$Name[mp_ep_final$id %in% mep_id]))
  }

  # Get party name
  # Worksheet 8 of the workbook holds the party lookup table; its columns are
  # renamed so the join keys match those of the MEP table.
  party_sheet <- read_excel("data/ep/mep_info_26Jul11.xls", sheet = 8)
  party_info <- rename(
    party_sheet,
    country = `Member State`,
    party = `Code`,
    party_fam = `Party Family`,
    party_name = `National Party`,
    party_abbrev = `Abbrev`
  )

  # Attach the party details to every MEP row, matching on party and country.
  mp_ep_final <- mp_ep_final %>%
    left_join(party_info, by = c("party", "country"))
  
  
  # Reorder data set
  # Keep only the columns listed here, in this order.
  mp_ep_final <- dplyr::select(
    mp_ep_final,
    id, Name, country,
    party_name, party_abbrev, party_group, party_fam, party,
    dim1, dim2,
    session
  )
  
  # Bundle the five roll-call tables into one unnamed list; element i holds
  # the table rcv_ep<i>.
  rcv_ep <- list(rcv_ep1, rcv_ep2, rcv_ep3, rcv_ep4, rcv_ep5)

  # Write both objects to a single file, then remove them from the workspace.
  save(list = c("mp_ep_final", "rcv_ep"), file = "data/ep/full_ep.Rda")
  rm(list = c("rcv_ep", "mp_ep_final"))